From 5e2bad938a65bd8ad4fcb40234771ddfefadd778 Mon Sep 17 00:00:00 2001 From: "kaf24@camelot.eng.3leafnetworks.com" Date: Sat, 4 Sep 2004 19:58:36 +0000 Subject: [PATCH] bitkeeper revision 1.1159.69.10 (413a1e6ckNNgvyiZ6JU5_vjN5ITSuA) Add pinning of L1 tables back in, as it helps 2.4 performance. At the same time I've fixed the 'mutable backptr' support in Xen. --- xen/arch/x86/domain.c | 106 ++++++++++++------ xen/arch/x86/memory.c | 63 ++++++----- xen/common/dom0_ops.c | 2 +- xen/common/dom_mem_ops.c | 2 +- xen/common/page_alloc.c | 6 +- xen/common/schedule.c | 14 --- xen/include/asm-x86/mm.h | 121 +++++++-------------- xen/include/hypervisor-ifs/dom0_ops.h | 2 +- xen/include/hypervisor-ifs/hypervisor-if.h | 20 ++-- 9 files changed, 163 insertions(+), 173 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 478daea09a..fc8b4847dc 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -449,51 +449,47 @@ long do_iopl(domid_t domain, unsigned int new_io_pl) #endif -void domain_relinquish_memory(struct domain *d) + +static void relinquish_list(struct domain *d, struct list_head *list) { - struct list_head *ent, *tmp; + struct list_head *ent; struct pfn_info *page; unsigned long x, y; - /* Ensure that noone is running over the dead domain's page tables. */ - synchronise_pagetables(~0UL); - - /* Exit shadow mode before deconstructing final guest page table. */ - shadow_mode_disable(d); - - /* Drop the in-use reference to the page-table base. */ - if ( pagetable_val(d->mm.pagetable) != 0 ) - put_page_and_type(&frame_table[pagetable_val(d->mm.pagetable) >> - PAGE_SHIFT]); - - /* - * Relinquish GDT mappings. No need for explicit unmapping of the LDT as - * it automatically gets squashed when the guest's mappings go away. - */ - destroy_gdt(d); - /* Use a recursive lock, as we may enter 'free_domheap_page'. */ spin_lock_recursive(&d->page_alloc_lock); - /* Relinquish Xen-heap pages. 
*/ - list_for_each_safe ( ent, tmp, &d->xenpage_list ) - { - page = list_entry(ent, struct pfn_info, list); - - if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) - put_page(page); - } - - /* Relinquish all pages on the domain's allocation list. */ - list_for_each_safe ( ent, tmp, &d->page_list ) + /* + * Careful! Any time we might decrement a page's reference count we + * might invalidate our page pointer or our pointer into the page list. + * In such cases we have to exit the current iteration of the loop and + * start back at the beginning of the list. We are guaranteed to make + * forward progress because nothing will get added to the list (the domain + * is dying) and no pages will become pinned after we unpin them. + */ + ent = list->next; + while ( ent != list ) { page = list_entry(ent, struct pfn_info, list); - if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_info) ) + if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) ) + { + /* NB. Check the allocation pin /before/ put_page_and_type()! */ + if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) + put_page(page); put_page_and_type(page); + /* May have lost our place in the list - start over. */ + ent = list->next; + continue; + } if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) + { put_page(page); + /* May have lost our place in the list - start over. */ + ent = list->next; + continue; + } /* * Forcibly invalidate base page tables at this point to break circular @@ -506,15 +502,61 @@ void domain_relinquish_memory(struct domain *d) x = y; if ( likely((x & (PGT_type_mask|PGT_validated)) != (PGT_base_page_table|PGT_validated)) ) + { + /* + * We have done no work on this iteration, so it is safe + * to move on to the next page in the list. + */ + ent = ent->next; break; + } y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated); if ( likely(y == x) ) + { free_page_type(page, PGT_base_page_table); + /* May have lost our place in the list - start over. 
*/ + ent = list->next; + } } while ( unlikely(y != x) ); } spin_unlock_recursive(&d->page_alloc_lock); + + /* + * Another CPU may have raced us to free some pages. Wait for those + * to trickle out now that we have released the lock. + */ + while ( !list_empty(list) ) + { + smp_mb(); + cpu_relax(); + } +} + + +void domain_relinquish_memory(struct domain *d) +{ + /* Ensure that noone is running over the dead domain's page tables. */ + synchronise_pagetables(~0UL); + + /* Exit shadow mode before deconstructing final guest page table. */ + shadow_mode_disable(d); + + /* Drop the in-use reference to the page-table base. */ + if ( pagetable_val(d->mm.pagetable) != 0 ) + put_page_and_type(&frame_table[pagetable_val(d->mm.pagetable) >> + PAGE_SHIFT]); + + /* + * Relinquish GDT mappings. No need for explicit unmapping of the LDT as + * it automatically gets squashed when the guest's mappings go away. + */ + destroy_gdt(d); + + /* Relinquish every page of memory. */ + relinquish_list(d, &d->xenpage_list); + relinquish_list(d, &d->page_list); } @@ -739,7 +781,7 @@ int construct_dom0(struct domain *p, /* Get another ref to L2 page so that it can be pinned. */ if ( !get_page_and_type(page, p, PGT_l2_page_table) ) BUG(); - set_bit(_PGC_guest_pinned, &page->count_info); + set_bit(_PGT_pinned, &page->u.inuse.type_info); } else { diff --git a/xen/arch/x86/memory.c b/xen/arch/x86/memory.c index baeadbe310..509e8346fc 100644 --- a/xen/arch/x86/memory.c +++ b/xen/arch/x86/memory.c @@ -455,7 +455,8 @@ get_page_from_l1e( /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. 
*/ static int get_page_from_l2e( - l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned long va_idx) + l2_pgentry_t l2e, unsigned long pfn, + struct domain *d, unsigned long va_idx) { int rc; @@ -471,7 +472,7 @@ get_page_from_l2e( rc = get_page_and_type_from_pagenr( l2_pgentry_to_pagenr(l2e), - PGT_l1_page_table | (va_idx<> 2 )) ) + ((unsigned long)pl2e & + ~PAGE_MASK) >> 2)) ) return 0; if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) @@ -828,18 +829,15 @@ static int do_extended_command(unsigned long ptr, unsigned long val) { case MMUEXT_PIN_L1_TABLE: case MMUEXT_PIN_L2_TABLE: - - /* When we pin an L1 page we now insist that the va - backpointer (used for writable page tables) must still be - mutable. This is an additional restriction even for guests - that don't use writable page tables, but I don't think it - will break anything as guests typically pin pages before - they are used, hence they'll still be mutable. */ - + /* + * We insist that, if you pin an L1 page, it's the first thing that + * you do to it. This is because we require the backptr to still be + * mutable. This assumption seems safe. + */ okay = get_page_and_type_from_pagenr( pfn, ((cmd==MMUEXT_PIN_L2_TABLE) ? 
- PGT_l2_page_table : (PGT_l1_page_table | PGT_va_mutable) ) , + PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)), FOREIGNDOM); if ( unlikely(!okay) ) @@ -849,8 +847,8 @@ static int do_extended_command(unsigned long ptr, unsigned long val) break; } - if ( unlikely(test_and_set_bit(_PGC_guest_pinned, - &page->count_info)) ) + if ( unlikely(test_and_set_bit(_PGT_pinned, + &page->u.inuse.type_info)) ) { MEM_LOG("Pfn %08lx already pinned", pfn); put_page_and_type(page); @@ -866,8 +864,8 @@ static int do_extended_command(unsigned long ptr, unsigned long val) MEM_LOG("Page %08lx bad domain (dom=%p)", ptr, page->u.inuse.domain); } - else if ( likely(test_and_clear_bit(_PGC_guest_pinned, - &page->count_info)) ) + else if ( likely(test_and_clear_bit(_PGT_pinned, + &page->u.inuse.type_info)) ) { put_page_and_type(page); put_page(page); @@ -1053,13 +1051,18 @@ static int do_extended_command(unsigned long ptr, unsigned long val) spin_lock(&e->page_alloc_lock); - /* Check that 'e' will accept the page and has reservation headroom. */ + /* + * Check that 'e' will accept the page and has reservation headroom. + * Also, a domain mustn't have PGC_allocated pages when it is dying. 
+ */ ASSERT(e->tot_pages <= e->max_pages); - if ( unlikely(e->tot_pages == e->max_pages) || + if ( unlikely(test_bit(DF_DYING, &e->flags)) || + unlikely(e->tot_pages == e->max_pages) || unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) ) { MEM_LOG("Transferee has no reservation headroom (%d,%d), or " - "provided a bad grant ref.\n", e->tot_pages, e->max_pages); + "provided a bad grant ref, or is dying (%08x).\n", + e->tot_pages, e->max_pages, e->flags); spin_unlock(&e->page_alloc_lock); put_domain(e); okay = 0; @@ -1195,6 +1198,7 @@ int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count) unsigned long prev_spfn = 0; l1_pgentry_t *prev_spl1e = 0; struct domain *d = current; + u32 type_info; perfc_incrc(calls_to_mmu_update); perfc_addc(num_page_updates, count); @@ -1243,10 +1247,11 @@ int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count) } page = &frame_table[pfn]; - switch ( (page->u.inuse.type_info & PGT_type_mask) ) + switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask ) { case PGT_l1_page_table: - if ( likely(passive_get_page_type(page, PGT_l1_page_table)) ) + if ( likely(get_page_type( + page, type_info & (PGT_type_mask|PGT_va_mask))) ) { okay = mod_l1_entry((l1_pgentry_t *)va, mk_l1_pgentry(req.val)); @@ -1496,11 +1501,11 @@ void ptwr_reconnect_disconnected(unsigned long addr) [ptwr_info[cpu].writable_l1>>PAGE_SHIFT]; #ifdef PTWR_TRACK_DOMAIN - if (ptwr_domain[cpu] != get_current()->domain) + if (ptwr_domain[cpu] != current->domain) printk("ptwr_reconnect_disconnected domain mismatch %d != %d\n", - ptwr_domain[cpu], get_current()->domain); + ptwr_domain[cpu], current->domain); #endif - PTWR_PRINTK(("[A] page fault in disconnected space: addr %08lx space %08lx\n", + PTWR_PRINTK(("[A] page fault in disconn space: addr %08lx space %08lx\n", addr, ptwr_info[cpu].disconnected << L2_PAGETABLE_SHIFT)); pl2e = &linear_l2_table[ptwr_info[cpu].disconnected]; @@ -1572,9 +1577,9 @@ void ptwr_flush_inactive(void) int i, idx; 
#ifdef PTWR_TRACK_DOMAIN - if (ptwr_info[cpu].domain != get_current()->domain) + if (ptwr_info[cpu].domain != current->domain) printk("ptwr_flush_inactive domain mismatch %d != %d\n", - ptwr_info[cpu].domain, get_current()->domain); + ptwr_info[cpu].domain, current->domain); #endif #if 0 { @@ -1655,9 +1660,9 @@ int ptwr_do_page_fault(unsigned long addr) if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table ) { #ifdef PTWR_TRACK_DOMAIN - if ( ptwr_info[cpu].domain != get_current()->domain ) + if ( ptwr_info[cpu].domain != current->domain ) printk("ptwr_do_page_fault domain mismatch %d != %d\n", - ptwr_info[cpu].domain, get_current()->domain); + ptwr_info[cpu].domain, current->domain); #endif pl2e = &linear_l2_table[(page->u.inuse.type_info & PGT_va_mask) >> PGT_va_shift]; diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c index 5ae8924143..5d95cdf184 100644 --- a/xen/common/dom0_ops.c +++ b/xen/common/dom0_ops.c @@ -628,7 +628,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op) break; } - if ( page->count_info & PGC_guest_pinned ) + if ( page->u.inuse.type_info & PGT_pinned ) type |= LPINTAB; l_arr[j] |= type; put_page(page); diff --git a/xen/common/dom_mem_ops.c b/xen/common/dom_mem_ops.c index ba570d91f7..a4ce55bbc0 100644 --- a/xen/common/dom_mem_ops.c +++ b/xen/common/dom_mem_ops.c @@ -82,7 +82,7 @@ static long free_dom_mem(struct domain *d, return i; } - if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_info) ) + if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) ) put_page_and_type(page); if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index faa3b6ec3a..02a46bf1c3 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -393,10 +393,13 @@ struct pfn_info *alloc_domheap_pages(struct domain *d, int order) spin_lock(&d->page_alloc_lock); - if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) ) + if ( unlikely(test_bit(DF_DYING, 
&d->flags)) || + unlikely((d->tot_pages + (1 << order)) > d->max_pages) ) { DPRINTK("Over-allocation for domain %u: %u > %u\n", d->domain, d->tot_pages + (1 << order), d->max_pages); + DPRINTK("...or the domain is dying (%d)\n", + !!test_bit(DF_DYING, &d->flags)); spin_unlock(&d->page_alloc_lock); free_heap_pages(MEMZONE_DOM, pg, order); return NULL; @@ -427,6 +430,7 @@ void free_domheap_pages(struct pfn_info *pg, int order) if ( unlikely(IS_XEN_HEAP_FRAME(pg)) ) { + /* NB. May recursively lock from domain_relinquish_memory(). */ spin_lock_recursive(&d->page_alloc_lock); for ( i = 0; i < (1 << order); i++ ) diff --git a/xen/common/schedule.c b/xen/common/schedule.c index cc06d3c085..a986ee06a1 100644 --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -374,20 +374,6 @@ void __enter_scheduler(void) cleanup_writable_pagetable( prev, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE); -#ifdef PTWR_TRACK_DOMAIN - { - extern domid_t ptwr_domain[]; - int cpu = smp_processor_id(); - if (ptwr_domain[cpu] != prev->domain) - printk("switch_to domain mismatch %d != %d\n", - ptwr_domain[cpu], prev->domain); - ptwr_domain[cpu] = next->domain; - if (ptwr_disconnected[cpu] != ENTRIES_PER_L2_PAGETABLE || - ptwr_writable_idx[cpu]) - printk("switch_to ptwr dirty!!!\n"); - } -#endif - perfc_incrc(sched_ctx); #if defined(WAKE_HISTO) diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 05813d64b7..944e5c3177 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -71,27 +71,27 @@ struct pfn_info /* Has this page been validated for use as its current type? */ #define _PGT_validated 28 #define PGT_validated (1<<_PGT_validated) - /* 10-bit most significant bits of va address if used as l1 page table */ -#define PGT_va_shift 18 + /* Owning guest has pinned this page to its current type? */ +#define _PGT_pinned 27 +#define PGT_pinned (1<<_PGT_pinned) + /* The 10 most significant bits of virt address if this is a page table. 
*/ +#define PGT_va_shift 17 #define PGT_va_mask (((1<<10)-1)<count_info)) ) + else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == + (PGT_pinned | 1)) ) { - /* if the page is pinned, but we're dropping the last reference - then make the va backpointer mutable again */ + /* Page is now only pinned. Make the back pointer mutable again. */ nx |= PGT_va_mutable; } } @@ -230,33 +229,36 @@ static inline int get_page_type(struct pfn_info *page, u32 type) nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); nx |= type; /* No extra validation needed for writable pages. */ - if ( (type & PGT_type_mask) == PGT_writable_page ) + if ( type == PGT_writable_page ) nx |= PGT_validated; } } - else if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) - { - DPRINTK("Unexpected type (saw %08x != exp %08x) for pfn %08lx\n", - x & PGT_type_mask, type, page_to_pfn(page)); - return 0; - } - else if ( (x & PGT_va_mask) == PGT_va_mutable ) - { - /* The va_backpointer is currently mutable, hence we update it. */ - nx &= ~PGT_va_mask; - nx |= type; /* we know the actual type is correct */ - } - else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask) ) ) + else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) ) { - /* The va backpointer wasn't mutable, and is different :-( */ - DPRINTK("Unexpected va backpointer (saw %08x != exp %08x) for pfn %08lx\n", - x, type, page_to_pfn(page)); - return 0; + if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) + { + DPRINTK("Bad type (saw %08x != exp %08x) for pfn %08lx\n", + x & PGT_type_mask, type, page_to_pfn(page)); + return 0; + } + else if ( (x & PGT_va_mask) == PGT_va_mutable ) + { + /* The va backpointer is mutable, hence we update it. */ + nx &= ~PGT_va_mask; + nx |= type; /* we know the actual type is correct */ + } + else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) ) + { + /* The va backpointer wasn't mutable, and is different. 
*/ + DPRINTK("Unexpected va backpointer (saw %08x != exp %08x)" + " for pfn %08lx\n", x, type, page_to_pfn(page)); + return 0; + } } else if ( unlikely(!(x & PGT_validated)) ) { /* Someone else is updating validation of this page. Wait... */ - while ( (y = page->u.inuse.type_info) != x ) + while ( (y = page->u.inuse.type_info) == x ) { rep_nop(); barrier(); @@ -286,55 +288,6 @@ static inline int get_page_type(struct pfn_info *page, u32 type) return 1; } -/* This 'passive' version of get_page_type doesn't attempt to validate -the page, but just checks the type and increments the type count. The -function is called while doing a NORMAL_PT_UPDATE of an entry in an L1 -page table: We want to 'lock' the page for the brief beriod while -we're doing the update, but we're not actually linking it in to a -pagetable. */ - -static inline int passive_get_page_type(struct pfn_info *page, u32 type) -{ - u32 nx, x, y = page->u.inuse.type_info; - again: - do { - x = y; - nx = x + 1; - if ( unlikely((nx & PGT_count_mask) == 0) ) - { - DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page)); - return 0; - } - else if ( unlikely((x & PGT_count_mask) == 0) ) - { - if ( (x & (PGT_type_mask|PGT_va_mask)) != type ) - { - nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); - nx |= type; - } - } - else if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) - { - DPRINTK("Unexpected type (saw %08x != exp %08x) for pfn %08lx\n", - x & PGT_type_mask, type, page_to_pfn(page)); - return 0; - } - else if ( unlikely(!(x & PGT_validated)) ) - { - /* Someone else is updating validation of this page. Wait... 
*/ - while ( (y = page->u.inuse.type_info) != x ) - { - rep_nop(); - barrier(); - } - goto again; - } - } - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); - - return 1; -} - static inline void put_page_and_type(struct pfn_info *page) { diff --git a/xen/include/hypervisor-ifs/dom0_ops.h b/xen/include/hypervisor-ifs/dom0_ops.h index ebac7397bd..2bae662d12 100644 --- a/xen/include/hypervisor-ifs/dom0_ops.h +++ b/xen/include/hypervisor-ifs/dom0_ops.h @@ -19,7 +19,7 @@ * This makes sure that old versions of dom0 tools will stop working in a * well-defined way (rather than crashing the machine, for instance). */ -#define DOM0_INTERFACE_VERSION 0xAAAA0013 +#define DOM0_INTERFACE_VERSION 0xAAAA0014 #define MAX_DOMAIN_NAME 16 diff --git a/xen/include/hypervisor-ifs/hypervisor-if.h b/xen/include/hypervisor-ifs/hypervisor-if.h index 49cc4c46a3..c7874bd708 100644 --- a/xen/include/hypervisor-ifs/hypervisor-if.h +++ b/xen/include/hypervisor-ifs/hypervisor-if.h @@ -149,16 +149,16 @@ #define MMUEXT_PIN_L2_TABLE 1 /* ptr = MA of frame to pin */ #define MMUEXT_PIN_L3_TABLE 2 /* ptr = MA of frame to pin */ #define MMUEXT_PIN_L4_TABLE 3 /* ptr = MA of frame to pin */ -#define MMUEXT_UNPIN_TABLE 1 /* ptr = MA of frame to unpin */ -#define MMUEXT_NEW_BASEPTR 2 /* ptr = MA of new pagetable base */ -#define MMUEXT_TLB_FLUSH 3 /* ptr = NULL */ -#define MMUEXT_INVLPG 4 /* ptr = VA to invalidate */ -#define MMUEXT_FLUSH_CACHE 5 -#define MMUEXT_SET_LDT 6 /* ptr = VA of table; val = # entries */ -#define MMUEXT_SET_FOREIGNDOM 7 /* val[31:16] = dom */ -#define MMUEXT_CLEAR_FOREIGNDOM 8 -#define MMUEXT_TRANSFER_PAGE 9 /* ptr = MA of frame; val[31:16] = dom */ -#define MMUEXT_REASSIGN_PAGE 10 +#define MMUEXT_UNPIN_TABLE 4 /* ptr = MA of frame to unpin */ +#define MMUEXT_NEW_BASEPTR 5 /* ptr = MA of new pagetable base */ +#define MMUEXT_TLB_FLUSH 6 /* ptr = NULL */ +#define MMUEXT_INVLPG 7 /* ptr = VA to invalidate */ +#define MMUEXT_FLUSH_CACHE 8 +#define MMUEXT_SET_LDT 
9 /* ptr = VA of table; val = # entries */ +#define MMUEXT_SET_FOREIGNDOM 10 /* val[31:16] = dom */ +#define MMUEXT_CLEAR_FOREIGNDOM 11 +#define MMUEXT_TRANSFER_PAGE 12 /* ptr = MA of frame; val[31:16] = dom */ +#define MMUEXT_REASSIGN_PAGE 13 #define MMUEXT_CMD_MASK 255 #define MMUEXT_CMD_SHIFT 8 -- 2.30.2